Exploring Study Sites

Load necessary packages

library(caret) # Classification and Regression Training
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
library(corrplot)
## corrplot 0.84 loaded
library(dplyr) # A Grammar of Data Manipulations
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats) # Tools for Working with Categorical Variables (Factors)
## Warning: package 'forcats' was built under R version 3.5.2
library(gdata) # Various R Programming Tools for Data Manipulation
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(ggplot2) # Create Elegant Data Visualizations Using the Grammar of Graphics
library(ggrepel) # Automatically Position Non-Overlapping Text Labels with ‘ggplot2’
## Warning: package 'ggrepel' was built under R version 3.5.2
library(ggridges) # Ridgeline Plots in ‘ggplot2’
## 
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual
library(ggthemes) # Extra Themes, Scales, and Geoms for ‘ggplot2’
## Warning: package 'ggthemes' was built under R version 3.5.2
library(gmodels) # Various R Programming Tools for Model Fitting
library(grid) # The Grid Graphics Package
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:gdata':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(knitr) # A General-Purpose Package for Dynamic Report Generation in R
## Warning: package 'knitr' was built under R version 3.5.2
library(lmtest) # Testing Linear Regression Models
## Warning: package 'lmtest' was built under R version 3.5.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(plyr) # Tools for Splitting, Applying and Combining Data
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(purrr) # Functional Programming Tools
## Warning: package 'purrr' was built under R version 3.5.2
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following object is masked from 'package:gdata':
## 
##     keep
## The following object is masked from 'package:caret':
## 
##     lift
library(readr) # Read Rectangular Text Data
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(scales) # Scale Functions for Visualization
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following object is masked from 'package:purrr':
## 
##     discard
library(shiny) # Web Application Framework for R
## Warning: package 'shiny' was built under R version 3.5.2
library(skimr) # Compact and Flexible Summaries of Data
## Warning: package 'skimr' was built under R version 3.5.2
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
## 
##     kable
## The following object is masked from 'package:stats':
## 
##     filter
library(stringr) # Simple, Consistent Wrappers for Common String Operations
library(tibble) # Simple Data Frames
## Warning: package 'tibble' was built under R version 3.5.2
library(tidyr) # Easily Tidy Data with ‘spread()’ and ‘gather()’ Functions
## Warning: package 'tidyr' was built under R version 3.5.2
library(tidyverse) # Install and Load ‘Tidyverse’ Packages
library(usmap)
## Warning: package 'usmap' was built under R version 3.5.2
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
## The following object is masked from 'package:plyr':
## 
##     ozone
library(mapdata)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggmap':
## 
##     theme_nothing
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library("googleway")
library("ggspatial") 
library("rnaturalearth") 
library("rnaturalearthdata")
theme_set(theme_bw())
library("sf")
## Warning: package 'sf' was built under R version 3.5.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(gridExtra)

world <- ne_countries(scale = "medium", returnclass = "sf") #for mapping

Load cleaned data

clean_data <- readRDS("../../data/processed_data/processeddata.rds")
skim(clean_data)
## Skim summary statistics
##  n obs: 13165 
##  n variables: 23 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────
##               variable missing complete     n n_unique
##              bioregion       0    13165 13165        6
##              georegion       0    13165 13165        9
##             group_code       0    13165 13165        9
##  group_code_UCSC_other       0    13165 13165        2
##                 island       0    13165 13165        6
##     marine_season_code       0    13165 13165       57
##       marine_site_name       0    13165 13165       77
##            method_code       0    13165 13165        4
##   method_code_IP_other       0    13165 13165        2
##             mpa_region       0    13165 13165        6
##            season_name       0    13165 13165        4
##              site_code       0    13165 13165       77
##           species_code       0    13165 13165        3
##                  state       0    13165 13165        4
##                                              top_counts ordered
##   San: 5608, Oly: 4138, Gov: 2366, Sal: 652               FALSE
##   CA : 5382, CA : 2592, CA : 1660, OR: 1386               FALSE
##    UCS: 9657, UCL: 2032, PBN: 400, SSS: 366               FALSE
##                 UCS: 9657, Oth: 3508, NA: 0               FALSE
##    Mai: 12364, Bar: 279, Sad: 250, Hat: 150               FALSE
##      SU1: 440, SU1: 403, SU1: 392, FA1: 376               FALSE
##      Mil: 524, Sco: 513, Sti: 472, End: 468               FALSE
##      IP: 12012, BT2: 782, GSE: 336, TS3: 35               FALSE
##                 IP: 12012, Oth: 1153, NA: 0               FALSE
##  Cen: 5382, Sou: 2627, NUL: 1932, Nor: 1660               FALSE
##    Sum: 4552, Fal: 4489, Spr: 4112, Win: 12               FALSE
##      MCR: 524, SCT: 513, SWC: 472, END: 468               FALSE
##                                           P.o: 11416, K   FALSE
##       CA: 10548, OR: 1386, WA: 865, AK: 366               FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────────────
##              variable missing complete     n    mean      sd   p0  p25
##  marine_common_season       0    13165 13165  116.82   19.23   78  101
##    marine_common_year       0    13165 13165 2009.7     4.81 2000 2006
##     marine_sort_order       0    13165 13165 5566.24 1125.15 1720 5020
##       season_sequence       0    13165 13165    2.03    0.81    1    1
##              size_bin       0    13165 13165   82.44   42.2     5   50
##       size_sort_order       0    13165 13165    9.57    4.15    1    6
##                 total       0    13165 13165    9.92   16.43    1    2
##   p50  p75 p100     hist
##   118  131  152 ▅▅▆▇▇▇▇▅
##  2010 2013 2018 ▃▅▅▇▇▆▆▆
##  6090 6370 7650 ▁▁▁▂▃▃▇▁
##     2    3    4 ▇▁▇▁▁▇▁▁
##    80  110  320 ▅▇▇▃▁▁▁▁
##    10   12   33 ▅▇▇▃▁▁▁▁
##     4   11  360 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────
##   variable missing complete     n    mean   sd      p0     p25     p50
##   latitude       0    13165 13165   38.67 5.2    32.87   34.73   36.62
##  longitude       0    13165 13165 -122.25 2.86 -135.38 -123.98 -121.94
##      p75    p100     hist
##    41.65   57.05 ▇▆▃▂▁▁▁▁
##  -120.62 -117.25 ▁▁▁▁▅▇▆▃

Overall histogram of counts per sample survey

clean_data %>% ggplot() + 
  geom_histogram(aes(total))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Create new data frame that just lists each site once

unique_sites <- subset(clean_data, !duplicated(clean_data$site_code)) 
#names(unique_sites)
sites_coordinates <- unique_sites[c(2,5:6,23)]
sites_coordinates_contig <- filter(sites_coordinates, sites_coordinates$state != "Alaska") 
#sites_coordinates
CA_coordinates <- filter(sites_coordinates, sites_coordinates$state == "CA")
OR_coordinates <- filter(sites_coordinates, sites_coordinates$state == "OR")
WA_coordinates <- filter(sites_coordinates, sites_coordinates$state == "WA")
AK_coordinates <- filter(sites_coordinates, sites_coordinates$state == "AK")

CA_coordinates <- as.data.frame(CA_coordinates)
OR_coordinates <- as.data.frame(OR_coordinates)
WA_coordinates <- as.data.frame(WA_coordinates)
AK_coordinates <- as.data.frame(AK_coordinates)

Plot sample sites

After messing around with several different mapping tools in R, I decided that trying to display the whole range (CA to AK) on one map would be really hard, and the Alaska sites are really just at one specific location, so I’m going to split Alaska off from the three states that are part of the contiguous US

Plot sites on state maps

Alaska:

world <- ne_countries(scale = "medium", returnclass = "sf")

AK_map <- ggplot(data = world) +
    geom_sf() +
  geom_point(data= sites_coordinates,aes(x=sites_coordinates$longitude, y=sites_coordinates$latitude,), size = 4, alpha = 0.4,  color ="orange") +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-168, -131), ylim = c(51, 73), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") +
  geom_text(aes(x = -151, y = 66, label = "ALASKA", size = 600)) +
  geom_text(aes(x = -149.9, y = 61.4, label = "*", size = 600)) +
  geom_text(aes(x = -154, y = 62.6, label = "Anchorage")) +
  scale_x_continuous(breaks = c(-160, -150, -140)) + 
  theme(legend.position = "none")

AK_map 
## Scale on map varies by more than 10%, scale bar may be inaccurate

ggsave(filename = "../../results/AK_sites_map.png",plot = AK_map, width = 3.5, height = 4) 
## Scale on map varies by more than 10%, scale bar may be inaccurate
State <- sites_coordinates_contig$state

CA_map <- ggplot(data = world) +
  geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-126, -116), ylim = c(32, 42.4), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")+ 
  geom_text(aes(x = -122.7, y = 37.7, label = ">")) + 
  geom_text(aes(x = -123.5, y = 37.6, label = "San ")) +
  geom_text(aes(x = -124.2, y = 37.1, label = "Francisco")) +
  geom_text(aes(x = -120, y = 39, label = "CALIFORNIA")) + 
  geom_text(aes(x = -118.9, y = 33.7, label = ">")) + 
  geom_text(aes(x = -120.9, y = 33.6, label = "Los Angeles")) +
  theme(legend.position = "none") + 
  scale_x_continuous(breaks = c(-124, -122, -120, -118))

CA_map
## Scale on map varies by more than 10%, scale bar may be inaccurate

ggsave(filename = "../../results/CA_sites_map.png",plot = CA_map, width = 3.5, height = 4) 
## Scale on map varies by more than 10%, scale bar may be inaccurate
OR_map <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "br", width_hint = 0.3) +
  coord_sf(xlim = c(-127.5, -120.8), ylim = c(41.5, 47.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") + 
  geom_text(aes(x = -122.4, y = 44, label = "OREGON")) +
  geom_text(aes(x = -122.8, y = 45.6, label = "*")) + 
  geom_text(aes(x = -122.8, y = 45.4, label = "Portland")) + 
  theme(legend.position = "none") +
  scale_x_continuous(breaks = c(-126, -124, -122))

OR_map
## Scale on map varies by more than 10%, scale bar may be inaccurate

ggsave(filename = "../../results/OR_sites_map.png",plot = OR_map, width = 3.5, height = 4) 
## Scale on map varies by more than 10%, scale bar may be inaccurate
WA_map <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "br", width_hint = 0.3) +
  coord_sf(xlim = c(-125.2, -121.3), ylim = c(45.8, 49.1), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") + 
  geom_text(aes(x = -122.5, y = 46.7, label = "WASHINGTON")) +
  geom_text(aes(x = -123.7, y = 47.93, label = "Olympic")) +
  geom_text(aes(x = -123.7, y = 47.75, label = "Peninsula")) +
  geom_text(aes(x = -121.9, y = 47.75, label = "< Seattle")) +
  theme(legend.position = "none") + 
  scale_x_continuous(breaks = c(-124, -123, -122))

WA_map 

ggsave(filename = "../../results/WA_sites_map.png",plot = WA_map, width = 3.5, height = 4) 
ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-128, -115), ylim = c(32, 49.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")+ geom_text(aes(x = -121, y = 38, label = "< San"))
## Scale on map varies by more than 10%, scale bar may be inaccurate

Exploring Count Data by State and Year Surveyed

What does the structure of the data look like by state?

CA

CA <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "CA")
summary(CA)
##       marine_site_name marine_common_year    size_bin     
##  Mill Creek   : 524    Min.   :2000       Min.   :  5.00  
##  Scott Creek  : 513    1st Qu.:2005       1st Qu.: 50.00  
##  Stillwater   : 472    Median :2009       Median : 80.00  
##  Enderts      : 468    Mean   :2009       Mean   : 81.92  
##  Terrace Point: 447    3rd Qu.:2013       3rd Qu.:110.00  
##  Andrew Molera: 443    Max.   :2018       Max.   :230.00  
##  (Other)      :7681                                       
##      total         state     
##  Min.   :  1.000   AK:    0  
##  1st Qu.:  2.000   CA:10548  
##  Median :  5.000   OR:    0  
##  Mean   :  9.607   WA:    0  
##  3rd Qu.: 11.000             
##  Max.   :360.000             
## 

OR

OR <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "OR")
summary(OR)
##       marine_site_name marine_common_year    size_bin     
##  Fogarty Creek:351     Min.   :2000       Min.   :  5.00  
##  Bob Creek    :334     1st Qu.:2005       1st Qu.: 50.00  
##  Burnt Hill   :260     Median :2010       Median : 80.00  
##  Ecola        :228     Mean   :2010       Mean   : 87.21  
##  Cape Arago   :213     3rd Qu.:2014       3rd Qu.:120.00  
##  Alegria      :  0     Max.   :2018       Max.   :230.00  
##  (Other)      :  0                                        
##      total        state    
##  Min.   :  1.00   AK:   0  
##  1st Qu.:  2.00   CA:   0  
##  Median :  7.00   OR:1386  
##  Mean   : 15.28   WA:   0  
##  3rd Qu.: 18.00            
##  Max.   :212.00            
## 

WA

WA <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "WA")
summary(WA)
##              marine_site_name marine_common_year    size_bin     
##  Saddlebag North Cove:163     Min.   :2008       Min.   :  5.00  
##  Post Point          :151     1st Qu.:2011       1st Qu.: 50.00  
##  Hat Island West     :116     Median :2014       Median : 90.00  
##  Point Grenville     :111     Mean   :2014       Mean   : 89.26  
##  Kydikabbit Point    :102     3rd Qu.:2016       3rd Qu.:120.00  
##  Saddlebag South East: 87     Max.   :2018       Max.   :320.00  
##  (Other)             :135                                        
##      total         state   
##  Min.   :  1.000   AK:  0  
##  1st Qu.:  1.000   CA:  0  
##  Median :  2.000   OR:  0  
##  Mean   :  6.776   WA:865  
##  3rd Qu.:  5.000           
##  Max.   :133.000           
## 

AK

AK <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "AK")
summary(AK)
##       marine_site_name marine_common_year    size_bin     
##  Sage Rock    :148     Min.   :2011       Min.   :  5.00  
##  Pirates Cove :131     1st Qu.:2013       1st Qu.: 40.00  
##  Kayak Island : 87     Median :2015       Median : 60.00  
##  Alegria      :  0     Mean   :2015       Mean   : 63.51  
##  Andrew Molera:  0     3rd Qu.:2017       3rd Qu.: 87.50  
##  Arroyo Hondo :  0     Max.   :2018       Max.   :190.00  
##  (Other)      :  0                                        
##      total        state   
##  Min.   :  1.00   AK:366  
##  1st Qu.:  1.00   CA:  0  
##  Median :  3.00   OR:  0  
##  Mean   :  6.03   WA:  0  
##  3rd Qu.:  7.00           
##  Max.   :117.00           
## 

[INSERT TABLE SUMMARIZING THIS INFORMATION]


What does sea star abundance over time look like?

clean_data %>% ggplot(aes(marine_common_year, total)) +  geom_jitter(width = 0.15, alpha = 0.25)

Lets color the points by state

clean_data %>% ggplot(aes(marine_common_year, total, col=state)) +  geom_jitter(width = 0.15, alpha = 0.25)

Woah, California is dominating this visualization.

Lets break that up into 4 parts:

clean_data %>%   
  ggplot(aes(marine_common_year, total, col=state)) +  geom_jitter(width = 0.15, alpha = 0.25) +
  facet_wrap(~state)

Washington and Alaska clearly weren’t surveyed for all years during which California and Oregon were.

What do these counts look like if we display them as log()?

clean_data %>%   
  ggplot(aes(marine_common_year, log(total), col=state)) +  geom_jitter(width = 0.15, alpha = 0.25) +
  facet_wrap(~state)


Abundance over time

clean_data %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot()


So far I’ve been looking at this across all 3 species of sea stars surveyed. What does the last graph look like for each species, seperately?

Summary statistics for each species over time

P.ochra_only <- filter(clean_data, species_code == "P.ochraceus")
K.tuni_only <- filter(clean_data, species_code == "K.tunicata")
E.tros_only <- filter(clean_data, species_code == "E.troschelii")
skim(P.ochra_only)
## Skim summary statistics
##  n obs: 11416 
##  n variables: 23 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────
##               variable missing complete     n n_unique
##              bioregion       0    11416 11416        6
##              georegion       0    11416 11416        9
##             group_code       0    11416 11416        9
##  group_code_UCSC_other       0    11416 11416        2
##                 island       0    11416 11416        6
##     marine_season_code       0    11416 11416       57
##       marine_site_name       0    11416 11416       77
##            method_code       0    11416 11416        4
##   method_code_IP_other       0    11416 11416        2
##             mpa_region       0    11416 11416        6
##            season_name       0    11416 11416        4
##              site_code       0    11416 11416       77
##           species_code       0    11416 11416        1
##                  state       0    11416 11416        4
##                                              top_counts ordered
##   San: 5086, Oly: 3232, Gov: 2366, Sal: 492               FALSE
##   CA : 4860, CA : 2592, CA : 1271, OR: 1138               FALSE
##    UCS: 8284, UCL: 2032, PBN: 291, CSU: 283               FALSE
##                 UCS: 8284, Oth: 3132, NA: 0               FALSE
##     Mai: 10885, Sad: 192, Bar: 151, Hat: 99               FALSE
##      FA1: 323, FA0: 313, SP0: 313, FA1: 312               FALSE
##      Sco: 453, Ter: 446, Haz: 404, Mil: 404               FALSE
##      IP: 10263, BT2: 782, GSE: 336, TS3: 35               FALSE
##                 IP: 10263, Oth: 1153, NA: 0               FALSE
##  Cen: 4860, Sou: 2627, NUL: 1414, Nor: 1271               FALSE
##    Fal: 4157, Spr: 3786, Sum: 3461, Win: 12               FALSE
##      SCT: 453, TPT: 446, HAZ: 404, MCR: 404               FALSE
##                                           P.o: 11416, E   FALSE
##        CA: 9433, OR: 1138, WA: 640, AK: 205               FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────────────
##              variable missing complete     n    mean      sd   p0  p25
##  marine_common_season       0    11416 11416  114.4    19.14   78   99
##    marine_common_year       0    11416 11416 2009.09    4.79 2000 2005
##     marine_sort_order       0    11416 11416 5682.02 1048.93 1720 5050
##       season_sequence       0    11416 11416    2.03    0.84    1    1
##              size_bin       0    11416 11416   86.83   42.67    5   50
##       size_sort_order       0    11416 11416    9.68    4.27    1    6
##                 total       0    11416 11416   10.85   17.41    1    2
##   p50  p75 p100     hist
##   114  130  152 ▅▅▆▇▆▆▅▃
##  2009 2013 2018 ▃▅▅▇▆▆▃▅
##  6150 6390 7650 ▁▁▁▂▃▂▇▁
##     2    3    4 ▇▁▆▁▁▇▁▁
##    90  120  320 ▅▇▇▅▁▁▁▁
##    10   13   33 ▅▇▇▅▁▁▁▁
##     5   13  360 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────
##   variable missing complete     n    mean   sd      p0     p25     p50
##   latitude       0    11416 11416   38.13 4.82   32.87   34.55   36.51
##  longitude       0    11416 11416 -121.94 2.57 -135.38 -123.51 -121.91
##      p75    p100     hist
##    40.34   57.05 ▇▅▂▂▁▁▁▁
##  -120.61 -117.25 ▁▁▁▁▅▇▇▅
skim(K.tuni_only)
## Skim summary statistics
##  n obs: 1676 
##  n variables: 23 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────
##               variable missing complete    n n_unique
##              bioregion       0     1676 1676        4
##              georegion       0     1676 1676        7
##             group_code       0     1676 1676        6
##  group_code_UCSC_other       0     1676 1676        2
##                 island       0     1676 1676        5
##     marine_season_code       0     1676 1676       27
##       marine_site_name       0     1676 1676       46
##            method_code       0     1676 1676        1
##   method_code_IP_other       0     1676 1676        1
##             mpa_region       0     1676 1676        5
##            season_name       0     1676 1676        3
##              site_code       0     1676 1676       46
##           species_code       0     1676 1676        1
##                  state       0     1676 1676        4
##                                         top_counts ordered
##  Oly: 906, San: 522, Sal: 126, AK_: 122              FALSE
##   CA : 522, CA : 389, OR: 248, CA : 204              FALSE
##   UCS: 1371, SSS: 122, PBN: 94, OCN: 65              FALSE
##              UCS: 1371, Oth: 305, NA: 0              FALSE
##    Mai: 1460, Bar: 93, Hat: 48, Sad: 46              FALSE
##  SU1: 131, SU1: 131, SU1: 129, SU1: 128              FALSE
##  Fal: 144, Mil: 120, Sti: 110, End: 105              FALSE
##        IP: 1676, BT2: 0, GSE: 0, TS3: 0              FALSE
##                 IP: 1676, Oth: 0, NA: 0              FALSE
##  Cen: 522, NUL: 445, Nor: 389, Nor: 204              FALSE
##   Sum: 1023, Fal: 331, Spr: 322, Win: 0              FALSE
##  FKC: 144, MCR: 120, SWC: 110, END: 105              FALSE
##                                       K.t: 1676, E   FALSE
##     CA: 1115, OR: 248, WA: 191, AK: 122              FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────────────
##              variable missing complete    n    mean      sd   p0  p25  p50
##  marine_common_season       0     1676 1676  132.29   10.05  114  123  133
##    marine_common_year       0     1676 1676 2013.57    2.54 2009 2011 2014
##     marine_sort_order       0     1676 1676 4911.32 1220.64 1720 4400 5050
##       season_sequence       0     1676 1676    2.01    0.62    1    2    2
##              size_bin       0     1676 1676   53.79   23.85    5   35   50
##       size_sort_order       0     1676 1676    8.95    3.05    1    7    9
##                 total       0     1676 1676    3.9     3.26    1    1    3
##   p75 p100     hist
##   141  150 ▆▇▅▆▇▇▅▆
##  2016 2018 ▇▇▅▆▇▇▆▇
##  6130 6310 ▂▁▁▁▂▇▁▆
##     2    3 ▂▁▁▇▁▁▁▂
##    70  130 ▃▃▇▅▇▂▁▁
##    11   17 ▂▂▃▇▇▅▁▁
##     5   19 ▇▂▂▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────
##   variable missing complete    n    mean   sd      p0     p25     p50
##   latitude       0     1676 1676   41.73 5.77   35.45   36.56   40.34
##  longitude       0     1676 1676 -124.07 3.33 -135.38 -124.14 -123.73
##      p75    p100     hist
##    44.24   57.05 ▇▅▆▂▃▁▁▂
##  -121.95 -120.95 ▁▁▁▁▁▁▇▆
skim(E.tros_only)
## Skim summary statistics
##  n obs: 73 
##  n variables: 23 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────
##               variable missing complete  n n_unique
##              bioregion       0       73 73        2
##              georegion       0       73 73        2
##             group_code       0       73 73        5
##  group_code_UCSC_other       0       73 73        2
##                 island       0       73 73        5
##     marine_season_code       0       73 73       10
##       marine_site_name       0       73 73       10
##            method_code       0       73 73        1
##   method_code_IP_other       0       73 73        1
##             mpa_region       0       73 73        1
##            season_name       0       73 73        3
##              site_code       0       73 73       10
##           species_code       0       73 73        1
##                  state       0       73 73        2
##                                   top_counts ordered
##    AK_: 39, Sal: 34, Cha: 0, Gov: 0            FALSE
##     AK: 39, WA : 34, CA : 0, CA : 0            FALSE
##    SSS: 39, UW: 16, PBN: 15, UCS: 2            FALSE
##              Oth: 71, UCS: 2, NA: 0            FALSE
##   Bar: 35, Mai: 19, Sad: 12, Kay: 4            FALSE
##   SU1: 24, SU1: 13, SU1: 12, SU1: 9            FALSE
##  Sag: 19, Pir: 16, Man: 13, Sad: 10            FALSE
##      IP: 73, BT2: 0, GSE: 0, TS3: 0            FALSE
##               IP: 73, Oth: 0, NA: 0            FALSE
##     NUL: 73, Cen: 0, Nor: 0, Nor: 0            FALSE
##     Sum: 68, Spr: 4, Fal: 1, Win: 0            FALSE
##  SAG: 19, PIR: 16, MAN: 13, SBN: 10            FALSE
##                                   E.t: 73, K   FALSE
##        AK: 39, WA: 34, CA: 0, OR: 0            FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────────────
##              variable missing complete  n    mean     sd   p0  p25  p50
##  marine_common_season       0       73 73  140.75   9.19  114  134  142
##    marine_common_year       0       73 73 2015.7    2.28 2009 2014 2016
##     marine_sort_order       0       73 73 2496.03 833.39 1720 1720 1740
##       season_sequence       0       73 73    1.96   0.26    1    2    2
##              size_bin       0       73 73   54.11  28.41    5   30   50
##       size_sort_order       0       73 73    6.4    2.87    1    4    6
##                 total       0       73 73    1.84   1.83    1    1    1
##   p75 p100     hist
##   150  150 ▁▁▁▂▃▁▃▇
##  2018 2018 ▁▁▁▂▃▁▃▇
##  3290 3850 ▇▁▁▁▁▃▃▁
##     2    3 ▁▁▁▇▁▁▁▁
##    70  160 ▅▇▆▅▅▁▁▁
##     8   17 ▅▇▆▅▅▁▁▁
##     2   13 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────
##   variable missing complete  n    mean  sd      p0     p25     p50     p75
##   latitude       0       73 73   52.86 4.5   47.58   48.52   56.99   57.05
##  longitude       0       73 73 -129.42 6.4 -135.38 -135.35 -135.32 -122.55
##     p100     hist
##    57.05 ▇▁▁▁▁▁▁▇
##  -122.52 ▇▁▁▁▁▁▁▇

Species abundance over time

P.ochra_only %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot()

K.tuni_only %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot()

E.tros_only %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot()

That last one looked like the log() function might not be applicable. Let’s see what it looks like without that:

E.tros_only %>% ggplot(aes(marine_common_year, total, group = marine_common_year)) + geom_boxplot()


Characterizing sea star abundance in the last decade

The most recent and catastrophic sea star wasting disease outbreak hit the west coast of North America around 2013. How did sea star abundance look before the epidemic, how did it change during the epidemic, and have the populations rebounded?

last_decade <- filter(clean_data, marine_common_year >= "2010")
last_decade %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot()

It looks like populations leveled back out at 2016 and remained relatively unchanged in 2017 and 2018.


Narrowing the time frame under analysis

It looks like theres a detectable decrease between 2013 and 2014, and that populations leveled back out at 2016 and remained relatively unchanged in 2017 and 2018. Lets narrow the time frame to only include sampling between 2013 and 2016.

last_decade$marine_common_year <- as.factor(last_decade$marine_common_year)
SSWD_timeframe <- filter(clean_data, marine_common_year >= "2013" & marine_common_year <= "2016")
SSWD_timeframe$marine_common_year <- as.factor(SSWD_timeframe$marine_common_year)
SSWD_timeframe %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot()

By state

SSWD_timeframe %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")


Fitting a linear model

SSWD_timeframe %>% ggplot(aes(marine_common_year, log(total), col=state)) +  geom_jitter(width = 0.45, alpha = 0.25) +
  facet_wrap(~state, 1) + geom_smooth(method = "lm", col="black") + theme(legend.position = "none")

Just California

SSWD_timeframe_CA <- filter(SSWD_timeframe, state == "CA")
SSWD_timeframe_CA$marine_common_year <- as.factor(SSWD_timeframe_CA$marine_common_year)
SSWD_timeframe_CA %>% ggplot(aes(marine_common_year, log(total), col=marine_common_year)) +  geom_jitter(width = 0.5, alpha = 0.25) + geom_smooth(method = "lm", col="red") + theme(legend.position = "none")


Doing the same as above, but separating by species, with a primary focus on Pisaster ochraceus (the organism I’m studing in my research)

SSWD_timeframe_P.ochra <- filter(P.ochra_only, marine_common_year >= "2013" & marine_common_year <= "2016")

SSWD_timeframe_P.ochra$marine_common_year <- as.factor(SSWD_timeframe_P.ochra$marine_common_year)

SSWD_timeframe_P.ochra %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")

SSWD_timeframe_K.tuni <- filter(K.tuni_only, marine_common_year >= "2013" & marine_common_year <= "2016")

SSWD_timeframe_K.tuni$marine_common_year <- as.factor(SSWD_timeframe_K.tuni$marine_common_year)

SSWD_timeframe_K.tuni %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")

SSWD_timeframe_E.tros <- filter(E.tros_only, marine_common_year >= "2013" & marine_common_year <= "2016")

SSWD_timeframe_E.tros$marine_common_year <- as.factor(SSWD_timeframe_E.tros$marine_common_year)

SSWD_timeframe_E.tros %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")

OH! I didn’t catch that the reason why E.tros had such few survey sightings was because its range appears to only be in Alaska. Speaking of which….


I should go back and plot each survey entry and color by species observed to get an idea of the ranges of each.

Species <- SSWD_timeframe$species_code
broad_jitter <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species), size = 2, alpha = 0.3, width = 3) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-128, -115), ylim = c(32, 49.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")

narrow_jitter <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species), size = 2, alpha = 0.3, width = 0.3) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-128, -115), ylim = c(32, 49.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")

gridExtra::grid.arrange(broad_jitter,narrow_jitter,nrow=2)
## Scale on map varies by more than 10%, scale bar may be inaccurate
## Scale on map varies by more than 10%, scale bar may be inaccurate

AK_zoom1_SSWD <- ggplot(data = world) +
    geom_sf() +
  geom_point(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species,), size = 2, alpha = 0.4) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-140, -130), ylim = c(55, 62), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") 
  

AK_zoom2_SSWD <- ggplot(data = world) +
    geom_sf() +
  geom_point(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species,), size = 2, alpha = 0.4) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-136, -135), ylim = c(56.6, 57.3), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") 
  
grid.arrange(AK_zoom1_SSWD, AK_zoom2_SSWD, nrow = 2)
## Scale on map varies by more than 10%, scale bar may be inaccurate